SpeakEasy code from Gaiteri et al, 2015.

net_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/"
expression_dir = "/pastel/projects/speakeasy_dlpfc/"

Data input

load(paste0(expression_dir, "exprdata_byregion.Rdata")) 
exprData4net = as.data.frame(exprData_MF) 
res.pca = prcomp(t(exprData4net)) 

autoplot(res.pca, data = as.data.frame(metadata_MF), colour = 'Batch') +
  scale_colour_viridis_d() +
  theme_classic() +
  easy_remove_legend() # Legend removed due to the high number of labels. 

Parameters for the network

print(Sys.setenv(MATLAB_R_HOME = "/usr/local/MATLAB/MATLAB_Runtime/v901")) ## Check Matlab path 

[1] TRUE

data=t(exprData_MF) # transpose to get by gene not by samples
writeMat(paste0(net_dir, "data.mat"), data=data)

data_loc = paste0(net_dir, "data.mat")
n_subclust = 3 # number of times the algorithm will run to check for subclusters. Default: 3 
min_clust = 30 # min number of genes by cluster
n_replicate = 100 # Default: 100
n_step = 50 # optimization steps
out_loc = net_dir
cor_type = "Pearson"
frac_sparse = 1 # top 20% (0.2) of the associations (gene expression, 100% is good. 
# For SNP or DNA methylation you need to set up a smaller threshold of frac_sparse.)

SpeakEasy function

Heavy chunck!

speakeasy <- function(data_loc,n_subclust,min_clust,n_replicate,n_step,frac_sparse,cor_type,out_loc){
  cmd = paste0("/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/run_SpeakEasy/for_testing/run_run_SpeakEasy.sh $MATLAB_R_HOME ",data_loc," ",
               n_subclust," ",min_clust," ",
               n_replicate," ",n_step," ",
               frac_sparse," ",cor_type," ",
               out_loc)
  system(cmd)
}

speakeasy(data_loc, n_subclust, min_clust, n_replicate, n_step, frac_sparse, cor_type, out_loc) # The output is a file

Clusters with gene_names

cluster_codes = readMat(paste0(net_dir, "cluster_codes.mat")) # 3 tables because of the n_subclust parameter. 

cluster_codes_df = as.data.frame(cluster_codes$cluster.codes)
cluster_codes_df = cluster_codes_df[,c(1,2,4,6)]
colnames(cluster_codes_df) = c("ensembl","cluster_lv1","cluster_lv2", "cluster_lv3")
cluster_codes_df$ensembl = colnames(data)

# Get the gene_symbol 
gene_names1 = as.data.frame(gene_names)
cluster_codes_df_names = merge(cluster_codes_df, gene_names1, by.x = "ensembl", by.y = "ensgene")
cluster_codes_df_names$probeid = NULL
write.table(cluster_codes_df_names, file = paste0(net_dir, "geneBycluster.txt"), sep = "\t", quote = F, row.names = F)

createDT(cluster_codes_df_names)

Number of genes by cluster

Clusters level 1

count1 = as.data.frame(table(cluster_codes_df$cluster_lv1))
colnames(count1) = c("cluster", "n_nodes")
total_nodes = sum(count1$n_nodes)
nodes_in_cluster = sum(count1$n_nodes[count1$n_nodes > min_clust])
message(paste0("Number of clusters with at least 30 nodes: "), length(count1$cluster[count1$n_nodes > min_clust])) # 30 in this case    
## Number of clusters with at least 30 nodes: 3
message(paste0("Number of genes assigned in clusters with at least 30 nodes: "), nodes_in_cluster, ". Percentage: ", (nodes_in_cluster/total_nodes)*100, "% of the genes are assigned to a cluster.")
## Number of genes assigned in clusters with at least 30 nodes: 17309. Percentage: 100% of the genes are assigned to a cluster.
createDT(count1) 

Clusters level 2

count2 = as.data.frame(table(cluster_codes_df$cluster_lv2))
colnames(count2) = c("cluster", "n_nodes")
total_nodes = sum(count2$n_nodes)
nodes_in_cluster = sum(count2$n_nodes[count2$n_nodes > min_clust])
message(paste0("Number of clusters with at least 30 nodes: "), length(count2$cluster[count2$n_nodes > min_clust])) # 30 in this case    
## Number of clusters with at least 30 nodes: 10
message(paste0("Number of genes assigned in clusters with at least 30 nodes: "), nodes_in_cluster, ". Percentage: ", (nodes_in_cluster/total_nodes)*100, "% of the genes are assigned to a cluster.")
## Number of genes assigned in clusters with at least 30 nodes: 17307. Percentage: 99.9884453174649% of the genes are assigned to a cluster.
createDT(count2) 

Clusters level 3

count3 = as.data.frame(table(cluster_codes_df$cluster_lv3))
colnames(count3) = c("cluster", "n_nodes")
total_nodes = sum(count3$n_nodes)
nodes_in_cluster = sum(count3$n_nodes[count3$n_nodes > min_clust])
message(paste0("Number of clusters with at least 30 nodes: "), length(count3$cluster[count3$n_nodes > min_clust])) # 30 in this case    
## Number of clusters with at least 30 nodes: 34
message(paste0("Number of genes assigned in clusters with at least 30 nodes: "), nodes_in_cluster, ". Percentage: ", (nodes_in_cluster/total_nodes)*100, "% of the genes are assigned to a cluster.")
## Number of genes assigned in clusters with at least 30 nodes: 17258. Percentage: 99.705355595355% of the genes are assigned to a cluster.
createDT(count3) 

Session info

sessionInfo()

R version 4.1.2 (2021-11-01) Platform: x86_64-pc-linux-gnu (64-bit) Running under: CentOS Linux 8

Matrix products: default BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.12.so

locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages: [1] stats4 stats graphics grDevices utils datasets methods
[8] base

other attached packages: [1] S4Vectors_0.32.3 BiocGenerics_0.40.0 ggeasy_0.1.3
[4] readxl_1.3.1 kableExtra_1.3.4 R.matlab_3.6.2
[7] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.8
[10] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
[13] tibble_3.1.6 tidyverse_1.3.1 limma_3.50.1
[16] ggfortify_0.4.14 ggplot2_3.3.5

loaded via a namespace (and not attached): [1] httr_1.4.2 sass_0.4.0 jsonlite_1.7.3 viridisLite_0.4.0 [5] R.utils_2.11.0 modelr_0.1.8 bslib_0.3.1 assertthat_0.2.1 [9] highr_0.9 cellranger_1.1.0 yaml_2.3.5 pillar_1.7.0
[13] backports_1.4.1 glue_1.6.1 digest_0.6.29 rvest_1.0.2
[17] colorspace_2.0-3 htmltools_0.5.2 R.oo_1.24.0 pkgconfig_2.0.3
[21] broom_0.7.12 haven_2.4.3 scales_1.1.1 webshot_0.5.2
[25] svglite_2.1.0 tzdb_0.2.0 farver_2.1.0 generics_0.1.2
[29] DT_0.20 ellipsis_0.3.2 withr_2.4.3 cli_3.2.0
[33] magrittr_2.0.2 crayon_1.5.0 evaluate_0.15 R.methodsS3_1.8.1 [37] fs_1.5.2 fansi_1.0.2 xml2_1.3.3 tools_4.1.2
[41] hms_1.1.1 lifecycle_1.0.1 munsell_0.5.0 reprex_2.0.1
[45] compiler_4.1.2 jquerylib_0.1.4 systemfonts_1.0.4 rlang_1.0.1
[49] grid_4.1.2 rstudioapi_0.13 htmlwidgets_1.5.4 crosstalk_1.2.0
[53] labeling_0.4.2 rmarkdown_2.11 gtable_0.3.0 DBI_1.1.2
[57] R6_2.5.1 gridExtra_2.3 lubridate_1.8.0 knitr_1.37
[61] fastmap_1.1.0 utf8_1.2.2 stringi_1.7.6 Rcpp_1.0.8
[65] vctrs_0.3.8 dbplyr_2.1.1 tidyselect_1.1.2 xfun_0.29